import random

from bs4 import BeautifulSoup
import requests

from app.db.db import Article, db_insert_article

s = requests.Session()  # shared Session object so cookies persist across requests

ARTICLE_FILE_PATH = 'D:\\studyProject\\python\\blog\\article.txt'  # file that stores scraped articles (not referenced in this chunk)
TEMP_FILE_PATH = 'D:\\studyProject\\python\\blog\\temp.txt'  # scratch file for a single article (not referenced in this chunk)

# Browser-like User-Agent so the target site serves the normal page to the scraper.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/87.0.4280.141 Safari/537.36"
}


def get_article_pic(pic_elements, index):
    """
    Return the cover-image URL of the article at *index*.

    :param pic_elements: iterable of <img> tags whose ``original`` attribute
                         holds a protocol-relative image URL
    :param index: zero-based position of the wanted article
    :return: absolute ``https:`` URL, or ``None`` when *index* is out of range
    """
    for i, pic in enumerate(pic_elements):
        if i == index:
            # The page stores protocol-relative URLs; prepend the scheme.
            return 'https:' + pic.get('original')
    return None


def get_article_title(title_elements, index):
    """
    Return the title text of the article at *index*.

    :param title_elements: iterable of tags whose ``.string`` is the title
    :param index: zero-based position of the wanted article
    :return: stripped title text, or ``None`` when *index* is out of range
    """
    for i, title in enumerate(title_elements):
        if i == index:
            return title.string.strip()
    return None


def get_article_keyword(keyword_elements, index):
    """
    Return the keyword/tag text of the article at *index*.

    The keyword text sits two siblings after the <i> marker element in the
    scraped list-page markup, hence the double ``next_sibling`` hop.

    :param keyword_elements: iterable of <i> keyword-marker tags
    :param index: zero-based position of the wanted article
    :return: stripped keyword text, or ``None`` when *index* is out of range
    """
    for i, keyword in enumerate(keyword_elements):
        if i == index:
            # Debug print removed; the caller stores the value on the Article.
            return keyword.next_sibling.next_sibling.string.strip()
    return None


def get_comment_view(vc):
    """
    Generate a pseudo-random engagement count for an article.

    :param vc: selector — ``0`` yields a view-like count in [200, 1000];
               any other value yields a comment-like count in [0, 5]
    :return: the generated integer
    """
    if vc == 0:
        return random.randint(200, 1000)
    return random.randint(0, 5)


def get_article_content(link):
    """
    Fetch an article detail page and extract its body and publish date.

    :param link: absolute URL of the article detail page
    :return: an :class:`Article` with ``content`` (the <article> tag's HTML
             as text) and ``addtime`` (the date part of the page's
             ``span.time`` text) filled in
    :raises requests.HTTPError: when the page request fails
    """
    # Send the same browser headers as the list-page request so the site
    # serves the normal page instead of blocking the default requests UA.
    r = s.get(link, headers=HEADERS)
    r.raise_for_status()
    r.encoding = 'utf-8'
    a_soup = BeautifulSoup(r.text, 'lxml')
    article_content = a_soup.find_all('article')[0]
    # The time span reads like "YYYY-MM-DD HH:MM"; keep only the date part.
    article_time = a_soup.find_all('span', class_='time')[0].string.split(' ')[0]
    article = Article()
    article.content = str(article_content)
    article.addtime = article_time
    return article


def get_article_desc(desc_elements, index):
    """
    Return the description text of the article at *index*.

    :param desc_elements: iterable of tags whose ``.string`` is the brief
    :param index: zero-based position of the wanted article
    :return: description with embedded newlines removed and surrounding
             whitespace stripped, or ``None`` when *index* is out of range
    """
    for i, desc in enumerate(desc_elements):
        if i == index:
            return desc.string.replace("\n", "").strip()
    return None


def storage_article_text(url):
    """
    Scrape an article list page and store at most ten articles in the DB.

    For each article on the list page, the cover image, title, keyword and
    description are taken from the list markup, the full body and publish
    date are fetched from the detail page, and the assembled Article is
    inserted via :func:`db_insert_article`.

    :param url: URL of the author's article-list page (sohu.com)
    :raises requests.HTTPError: when the list page request fails
    """
    r = s.get(url, headers=HEADERS)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    pic_list = soup.find_all('img', class_='cover-pic')  # article cover images
    title_list = soup.find_all('a', attrs={'data-spm-type': 'content'})  # article title links
    keyword_list = soup.find_all('i', class_='mp-iconqietu-biaoqian')  # article keyword markers
    desc_list = soup.find_all('p', class_='feed-brief')  # article descriptions
    # Insert at most ten records per run (same limit as the original
    # manual-counter loop, expressed as a slice).
    for index, link in enumerate(title_list[:10]):
        link_content = 'https:' + link['href']  # hrefs are protocol-relative
        article = get_article_content(link_content)
        article.pic = get_article_pic(pic_list, index)
        article.title = get_article_title(title_list, index)
        article.keyword = get_article_keyword(keyword_list, index)
        article.desc = get_article_desc(desc_list, index)
        # Plausible fabricated engagement numbers: many views, few comments.
        article.views = get_comment_view(0)
        article.comments = get_comment_view(1)
        article.source = 'https://www.sohu.com'
        article.ischeck = 1
        article.istop = 0
        db_insert_article(article)
